import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import scipy.stats as stats
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
%matplotlib inline
#to make the interactive maps
import folium
from folium.plugins import FastMarkerCluster
from folium.plugins import MarkerCluster
import geopandas as gpd
from branca.colormap import LinearColormap
# Import necessary libraries
import plotly.graph_objs as go
import chart_studio.plotly as py
from plotly.offline import iplot, init_notebook_mode
import cufflinks as cf
# Initialize Plotly for offline use
init_notebook_mode(connected=True)
#text mining
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud
Data Understanding
path = r'C:\Users\prisi\Documents\2021\UNI\ws23\IU\Data Analytics and Big Data\Sydney, New South Wales, Australia - Airbnb\listings.csv'
pathL = r'C:\Users\prisi\Documents\2021\UNI\ws23\IU\Data Analytics and Big Data\Sydney, New South Wales, Australia - Airbnb\listings_detailed.csv.gz'
pathT =r'C:\Users\prisi\Documents\2021\UNI\ws23\IU\Data Analytics and Big Data\Sydney, New South Wales, Australia - Airbnb\Sydney_Touristic-Places.csv'
listings = pd.read_csv(path, index_col= "id")
listings_details = pd.read_csv(pathL, index_col= "id", low_memory=False)
file_path = 'C:/Users/prisi/Documents/2021/UNI/ws23/IU/Data Analytics and Big Data/Sydney, New South Wales, Australia - Airbnb/calendar.csv.gz'
calendar = pd.read_csv(file_path, compression='gzip', parse_dates=['date'], index_col=['listing_id'])
pathR = r'C:\Users\prisi\Documents\2021\UNI\ws23\IU\Data Analytics and Big Data\Sydney, New South Wales, Australia - Airbnb\reviews.csv'
pathRD = r'C:\Users\prisi\Documents\2021\UNI\ws23\IU\Data Analytics and Big Data\Sydney, New South Wales, Australia - Airbnb\reviews_detailed.csv'
reviews = pd.read_csv(pathR, parse_dates=['date'], index_col=['listing_id'])
reviews_details = pd.read_csv(pathRD, parse_dates=['date']) #, index_col=['id'])
pathLGA = r'C:\Users\prisi\Documents\2021\UNI\ws23\IU\Data Analytics and Big Data\Sydney, New South Wales, Australia - Airbnb\LGA_trends.xlsx'
lga = pd.read_excel(pathLGA)
tourist = pd.read_csv(pathT)
C:\Users\prisi\AppData\Local\Temp\ipykernel_328\3959433266.py:9: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False. C:\Users\prisi\AppData\Roaming\Python\Python312\site-packages\openpyxl\worksheet\header_footer.py:48: UserWarning: Cannot parse header or footer so it will be ignored
lga.columns
Index(['Local Government Area', 'Offence type', 'Jan - Dec 2019',
'Jan - Dec 2020', 'Jan - Dec 2021', 'Jan - Dec 2022', 'Jan - Dec 2023',
'Rate per 100,000 population Jan - Dec 2023',
'LGA Rank \n Jan - Dec 2023',
'2 year trend and annual percent change (Jan 2022-Dec 2023)',
'5 year trend and average annual percent change (Jan 2019-Dec 2023)'],
dtype='object')
lga.head()
| Local Government Area | Offence type | Jan - Dec 2019 | Jan - Dec 2020 | Jan - Dec 2021 | Jan - Dec 2022 | Jan - Dec 2023 | Rate per 100,000 population Jan - Dec 2023 | LGA Rank \n Jan - Dec 2023 | 2 year trend and annual percent change (Jan 2022-Dec 2023) | 5 year trend and average annual percent change (Jan 2019-Dec 2023) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albury | Murder * | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.8 | 14 | nc | nc |
| 1 | Albury | Attempted murder | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | nc | nc | nc | nc |
| 2 | Albury | Murder accessory, conspiracy | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | nc | nc | nc | nc |
| 3 | Albury | Manslaughter * | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | nc | nc | nc | nc |
| 4 | Albury | Domestic violence related assault | 297.0 | 328.0 | 405.0 | 408.0 | 418.0 | 737.7 | 33 | Stable | Up 8.9% |
lga["Offence type"].unique()
array(['Murder *', 'Attempted murder', 'Murder accessory, conspiracy',
'Manslaughter *', 'Domestic violence related assault',
'Non-domestic violence related assault', 'Assault Police',
'Sexual assault',
'Sexual touching, sexual act and other sexual offences',
'Abduction and kidnapping', 'Robbery without a weapon',
'Robbery with a firearm', 'Robbery with a weapon not a firearm',
'Blackmail and extortion', 'Intimidation, stalking and harassment',
'Other offences against the person', 'Break and enter dwelling',
'Break and enter non-dwelling',
'Receiving or handling stolen goods', 'Motor vehicle theft',
'Steal from motor vehicle', 'Steal from retail store',
'Steal from dwelling', 'Steal from person', 'Stock theft', 'Fraud',
'Other theft', 'Arson', 'Malicious damage to property',
'Possession and/or use of cocaine',
'Possession and/or use of narcotics',
'Possession and/or use of cannabis',
'Possession and/or use of amphetamines',
'Possession and/or use of ecstasy',
'Possession and/or use of other drugs',
'Dealing, trafficking in cocaine',
'Dealing, trafficking in narcotics',
'Dealing, trafficking in cannabis',
'Dealing, trafficking in amphetamines',
'Dealing, trafficking in ecstasy',
'Dealing, trafficking in other drugs', 'Cultivating cannabis',
'Manufacture drug', 'Importing drugs', 'Other drug offences',
'Prohibited and regulated weapons offences', 'Trespass',
'Offensive conduct', 'Offensive language', 'Criminal intent',
'Betting and gaming offences', 'Liquor offences',
'Pornography offences', 'Prostitution offences', 'Escape custody',
'Breach Apprehended Violence Order', 'Breach bail conditions',
'Fail to appear', 'Resist or hinder officer',
'Other offences against justice procedures',
'Transport regulatory offences', 'Other offences', nan],
dtype=object)
lga["Local Government Area"].unique()
array(['Albury', 'Armidale Regional', 'Ballina', 'Balranald',
'Bathurst Regional', 'Bayside', 'Bega Valley', 'Bellingen',
'Berrigan', 'Blacktown', 'Bland', 'Blayney', 'Blue Mountains',
'Bogan', 'Bourke', 'Brewarrina', 'Broken Hill', 'Burwood', 'Byron',
'Cabonne', 'Camden', 'Campbelltown', 'Canada Bay',
'Canterbury-Bankstown', 'Carrathool', 'Central Coast',
'Central Darling', 'Cessnock', 'Clarence Valley', 'Cobar',
'Coffs Harbour', 'Coolamon', 'Coonamble', 'Cootamundra-Gundagai',
'Cowra', 'Cumberland', 'Dubbo Regional', 'Dungog', 'Edward River',
'Eurobodalla', 'Fairfield', 'Federation', 'Forbes',
'Georges River', 'Gilgandra', 'Glen Innes Severn',
'Goulburn Mulwaree', 'Greater Hume Shire', 'Griffith', 'Gunnedah',
'Gwydir', 'Hawkesbury', 'Hay', 'Hilltops', 'Hornsby',
'Hunters Hill', 'Inner West', 'Inverell', 'Junee', 'Kempsey',
'Kiama', 'Ku-ring-gai', 'Kyogle', 'Lachlan', 'Lake Macquarie',
'Lane Cove', 'Leeton', 'Lismore', 'Lithgow', 'Liverpool',
'Liverpool Plains', 'Lockhart', 'Lord Howe Island', 'Maitland',
'Mid-Coast', 'Mid-Western Regional', 'Moree Plains', 'Mosman',
'Murray River', 'Murrumbidgee', 'Muswellbrook', 'Nambucca Valley',
'Narrabri', 'Narrandera', 'Narromine', 'Newcastle', 'North Sydney',
'Northern Beaches', 'Oberon', 'Orange', 'Parkes', 'Parramatta',
'Penrith', 'Port Macquarie-Hastings', 'Port Stephens',
'Queanbeyan-Palerang Regional', 'Randwick', 'Richmond Valley',
'Ryde', 'Shellharbour', 'Shoalhaven', 'Singleton',
'Snowy Monaro Regional', 'Snowy Valleys', 'Strathfield',
'Sutherland Shire', 'Sydney', 'Tamworth Regional', 'Temora',
'Tenterfield', 'The Hills Shire', 'Tweed',
'Unincorporated Far West', 'Upper Hunter Shire',
'Upper Lachlan Shire', 'Uralla', 'Wagga Wagga', 'Walcha',
'Walgett', 'Warren', 'Warrumbungle Shire', 'Waverley', 'Weddin',
'Wentworth', 'Willoughby', 'Wingecarribee', 'Wollondilly',
'Wollongong', 'Woollahra', 'Yass Valley', 'In Custody', nan,
"^ LGAs with populations lower than 3000 are excluded (indicated by 'nc') because rate calculations for these areas are very sensitive to small changes in population sizes and the number of incidents recorded. Rate calculations should also be treated very cautiously for LGAs that have high visitor numbers relative to their residential population. This is because rate calculations are based on estimated residential population and no adjustment has been made for the number of people visiting each LGA per year. For the rate calculations, specialised population data were prepared and provided to BOCSAR by the Australian Bureau of Statistics.",
'^^ Sydney LGAs, In Custody, LGAs with population less than 3,000 and LGAs with zero incidents for the offence type are NOT assigned LGA ranks.',
"^^^ Shows the results of a statistical test for a significant upward or downward trend in the monthly number of criminal incidents recorded. Where the trend is significant (i.e p<0.05) the average annual percentage change over the five and two year period is shown. Significant upward trends are highlighted in red; significant downward trends are highlighted in yellow. 'Stable' indicates there was no significant upward or downward trend and 'nc' indicates that the number of incidents recorded was too small for a reliable trend test to be performed.",
'Source: NSW Bureau of Crime Statistics and Research, reference LGA_trends23Q4.',
'Please retain this reference number for future correspondence.',
'NOTE: Data sourced from the NSW Bureau of Crime Statistics and Research must be acknowledged in any document (electronic or otherwise) containing that data.',
"The acknowledgement should take the form of 'Source: NSW Bureau of Crime Statistics and Research'.",
"Our 'Using crime statistics' webpage includes information on a revised measure of Aboriginality."],
dtype=object)
Crime rate
import matplotlib.dates as mdates
lgas_of_interest = ['Sydney', 'Waverley', 'Randwick', 'Sutherland Shire', 'Woollahra', 'North Sydney']
offense_types = [
'Sexual assault', 'Sexual touching, sexual act and other sexual offences',
'Robbery without a weapon', 'Robbery with a firearm',
'Robbery with a weapon not a firearm', 'Blackmail and extortion',
'Intimidation, stalking and harassment', 'Malicious damage to property'
]
filtered_crime_data = lga[
(lga['Offence type'].isin(offense_types)) &
(lga['Local Government Area'].isin(lgas_of_interest))
]
melted_crime_data = pd.melt(filtered_crime_data, id_vars=['Local Government Area', 'Offence type'],
value_vars=['Jan - Dec 2019', 'Jan - Dec 2020', 'Jan - Dec 2021',
'Jan - Dec 2022', 'Jan - Dec 2023'],
var_name='Year', value_name='Incidents')
# Convert 'Year' to datetime
melted_crime_data['Year'] = pd.to_datetime(melted_crime_data['Year'].str[-4:])
#pivot_crime_data = melted_crime_data.pivot_table(index='Year', columns=['Local Government Area', 'Offence type'], values='Incidents', aggfunc='sum')
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(16, 24), sharex=True, sharey=True)
axes = axes.flatten()
for index, lga in enumerate(lgas_of_interest):
ax = axes[index]
lga_data = melted_crime_data[melted_crime_data['Local Government Area'] == lga]
pivot_lga_data = lga_data.pivot_table(index='Year', columns='Offence type', values='Incidents', aggfunc='sum')
for offense in offense_types:
if offense in pivot_lga_data.columns:
series = pivot_lga_data[offense]
ax.plot(series.index, series, marker='o', linestyle='-', label=offense)
# Annotate each data point with its value
for i, value in enumerate(series):
ax.text(series.index[i], value, f'{int(value)}', ha='center', va='bottom')
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.set_title(f"Incidents in {lga}", fontsize=15)
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Number of Incidents", fontsize=12)
ax.tick_params(axis='x', rotation=45)
ax.grid(True)
# Adjust the layout to make space for the legend below the plots
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# Collect handles and labels for the global legend
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=len(offense_types), title='Offence Type')
# Display the plots
plt.show()
#TOURIST PLACES
print("Tourist List shapes and columns:")
print(tourist.columns)
print(tourist.shape)
tourist
Tourist List shapes and columns: Index(['tid', 'Tourist Place', 'Latitude', 'Longitude'], dtype='object') (18, 4)
| tid | Tourist Place | Latitude | Longitude | |
|---|---|---|---|---|
| 0 | 1 | Sydney Opera House | -33.856784 | 151.215293 |
| 1 | 2 | Sydney Harbour Bridge | -33.852222 | 151.210833 |
| 2 | 3 | Bondi Beach | -33.890542 | 151.274856 |
| 3 | 4 | Royal Botanic Garden Sydney | -33.866534 | 151.218345 |
| 4 | 5 | Taronga Zoo | -33.843235 | 151.240640 |
| 5 | 6 | Darling Harbour | -33.874886 | 151.200775 |
| 6 | 7 | The Rocks | -33.859035 | 151.208023 |
| 7 | 8 | Manly Beach | -33.800383 | 151.287743 |
| 8 | 9 | Coogee Beach | -33.923126 | 151.259129 |
| 9 | 10 | Bronte Beach | -33.905618 | 151.265063 |
| 10 | 11 | Art Gallery of New South Wales | -33.868500 | 151.222300 |
| 11 | 12 | Barangaroo Reserve | -33.858400 | 151.201000 |
| 12 | 13 | Museum of Contemporary Art (MCA) | -33.860000 | 151.209100 |
| 13 | 14 | Paddy's Markets | -33.878500 | 151.203000 |
| 14 | 15 | Luna Park | -33.848300 | 151.213900 |
| 15 | 16 | Cockatoo Island | -33.851600 | 151.172000 |
| 16 | 17 | Circular Quay | -33.860000 | 151.209100 |
| 17 | 18 | Chinatown | -33.878500 | 151.203000 |
NSWTrends.columns
Index(['NSW Recorded Crime Statistics January 2019-December 2023',
'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'],
dtype='object')
NSWTrends.head()
| NSW Recorded Crime Statistics January 2019-December 2023 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Incidents of crime recorded by the NSW Police ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Offence type | Jan - Dec 2019 | Jan - Dec 2020 | Jan - Dec 2021 | Jan - Dec 2022 | Jan - Dec 2023 | Rate per 100,000 population Jan - Dec 2023 | 2 year trend and annual percent change (Jan 20... | 5 year trend and average annual percent change... |
| 3 | Murder * | 73 | 69 | 52 | 59 | 56 | 0.7 | Stable | Down 6.4% |
| 4 | Attempted murder | 26 | 27 | 19 | 13 | 14 | 0.2 | nc | nc |
print("Listings shapes and columns:")
print(listings.columns)
print(listings.shape)
listings.head()
Listings shapes and columns:
Index(['name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood',
'latitude', 'longitude', 'room_type', 'price', 'minimum_nights',
'number_of_reviews', 'last_review', 'reviews_per_month',
'calculated_host_listings_count', 'availability_365',
'number_of_reviews_ltm', 'license'],
dtype='object')
(14274, 17)
| name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||
| 1050164598350388320 | Self service English breakfast | 211218678 | Paul | NaN | Botany Bay | -33.924060 | 151.197320 | Private room | 100.0 | 1 | 0 | NaN | NaN | 1 | 268 | 0 | PID-STRA-61891 |
| 1096082243192434668 | Apartment in Rushcutters Bay | 26721338 | Anne | NaN | Sydney | -33.874830 | 151.225800 | Entire home/apt | 250.0 | 1 | 0 | NaN | NaN | 2 | 228 | 0 | Exempt |
| 2167538 | Modern Sydney home near beach,shops | 11059668 | Sophie | NaN | Waverley | -33.895820 | 151.245490 | Entire home/apt | 650.0 | 90 | 1 | 2015-04-01 | 0.01 | 1 | 365 | 0 | NaN |
| 790607305500629830 | Luxury double room | 202112801 | Vivian | NaN | Canada Bay | -33.851626 | 151.086324 | Private room | 84.0 | 1 | 31 | 2024-03-07 | 2.13 | 30 | 365 | 26 | PID-STRA-21723 |
| 641106357139201992 | Chic Studio Apartment in Prime Bondi Location | 279001183 | MadeComfy | NaN | Waverley | -33.890550 | 151.275230 | Entire home/apt | 508.0 | 1 | 17 | 2023-02-19 | 0.84 | 217 | 0 | 0 | PID-STRA-36651 |
print("listings in details")
print(listings_details.columns)
print(listings_details.shape)
listings_details.head()
listings in details
Index(['listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
'description', 'neighborhood_overview', 'picture_url', 'host_id',
'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
'host_response_time', 'host_response_rate', 'host_acceptance_rate',
'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
'host_neighbourhood', 'host_listings_count',
'host_total_listings_count', 'host_verifications',
'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
'maximum_minimum_nights', 'minimum_maximum_nights',
'maximum_maximum_nights', 'minimum_nights_avg_ntm',
'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
'availability_30', 'availability_60', 'availability_90',
'availability_365', 'calendar_last_scraped', 'number_of_reviews',
'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
'last_review', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin',
'review_scores_communication', 'review_scores_location',
'review_scores_value', 'license', 'instant_bookable',
'calculated_host_listings_count',
'calculated_host_listings_count_entire_homes',
'calculated_host_listings_count_private_rooms',
'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
dtype='object')
(14274, 74)
| listing_url | scrape_id | last_scraped | source | name | description | neighborhood_overview | picture_url | host_id | host_url | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| 1050164598350388320 | https://www.airbnb.com/rooms/1050164598350388320 | 20240316023820 | 2024-03-18 | city scrape | Self service English breakfast | Close to transport airport older style unit ne... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 211218678 | https://www.airbnb.com/users/show/211218678 | ... | NaN | NaN | NaN | PID-STRA-61891 | f | 1 | 0 | 1 | 0 | NaN |
| 1096082243192434668 | https://www.airbnb.com/rooms/1096082243192434668 | 20240316023820 | 2024-03-18 | city scrape | Apartment in Rushcutters Bay | Full 2 bedroom apartment with balcony. Walking... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 26721338 | https://www.airbnb.com/users/show/26721338 | ... | NaN | NaN | NaN | Exempt | f | 2 | 1 | 1 | 0 | NaN |
| 2167538 | https://www.airbnb.com/rooms/2167538 | 20240316023820 | 2024-03-18 | city scrape | Modern Sydney home near beach,shops | **Availability extremely flexible**<br /><br /... | Queens Park is a safe and friendly neighbourho... | https://a0.muscache.com/pictures/29922910/2399... | 11059668 | https://www.airbnb.com/users/show/11059668 | ... | 5.00 | 5.00 | 4.00 | NaN | f | 1 | 1 | 0 | 0 | 0.01 |
| 790607305500629830 | https://www.airbnb.com/rooms/790607305500629830 | 20240316023820 | 2024-03-18 | city scrape | Luxury double room | Relax in this tranquil and stylish space. | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 202112801 | https://www.airbnb.com/users/show/202112801 | ... | 4.77 | 4.68 | 4.68 | PID-STRA-21723 | f | 30 | 5 | 25 | 0 | 2.13 |
| 641106357139201992 | https://www.airbnb.com/rooms/641106357139201992 | 20240316023820 | 2024-03-18 | city scrape | Chic Studio Apartment in Prime Bondi Location | Immerse yourself in the vibrant atmosphere of ... | Bondi is one of Australia’s most iconic suburb... | https://a0.muscache.com/pictures/prohost-api/H... | 279001183 | https://www.airbnb.com/users/show/279001183 | ... | 3.94 | 4.94 | 4.29 | PID-STRA-36651 | f | 217 | 217 | 0 | 0 | 0.84 |
5 rows × 74 columns
print("reviews shapes and columns:")
print(reviews.columns)
print(reviews.shape)
reviews.head()
reviews shapes and columns: Index(['date'], dtype='object') (499595, 1)
| date | |
|---|---|
| listing_id | |
| 11156 | 2009-12-05 |
| 11156 | 2010-03-31 |
| 11156 | 2010-05-14 |
| 11156 | 2010-11-17 |
| 11156 | 2010-12-11 |
print("reviews in details")
print(reviews_details.columns)
print(reviews_details.shape)
reviews_details.head()
reviews in details Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object') (499595, 6)
| listing_id | id | date | reviewer_id | reviewer_name | comments | |
|---|---|---|---|---|---|---|
| 0 | 11156 | 19220 | 2009-12-05 | 52946 | Jeff | Colleen was friendly and very helpful regardin... |
| 1 | 11156 | 32592 | 2010-03-31 | 99382 | Michael | Great place, centrally located, easy walk to t... |
| 2 | 11156 | 42280 | 2010-05-14 | 105797 | Marina | Colleen is very friendly and helpful. The apar... |
| 3 | 11156 | 140942 | 2010-11-17 | 259213 | Sigrid | Dear Colleen!\r<br/>My friend Diemut from Germ... |
| 4 | 11156 | 151966 | 2010-12-11 | 273389 | Eduardo | Lovely, interesting chats and very helpful giv... |
list_a = ['listing_url', 'scrape_id', 'last_scraped', 'source', 'name','description', 'neighborhood_overview', 'picture_url', 'host_id','host_url', 'host_name', 'host_since',
'host_location', 'host_about','host_response_time', 'host_response_rate', 'host_acceptance_rate','host_is_superhost', 'host_thumbnail_url', 'host_picture_url','host_neighbourhood',
'host_listings_count','host_total_listings_count', 'host_verifications','host_has_profile_pic', 'host_identity_verified', 'neighbourhood','neighbourhood_cleansed', 'neighbourhood_group_cleansed',
'latitude','longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms','bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price','minimum_nights', 'maximum_nights',
'minimum_minimum_nights','maximum_minimum_nights', 'minimum_maximum_nights','maximum_maximum_nights', 'minimum_nights_avg_ntm','maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
'availability_30', 'availability_60', 'availability_90','availability_365', 'calendar_last_scraped', 'number_of_reviews','number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
'last_review', 'review_scores_rating', 'review_scores_accuracy','review_scores_cleanliness', 'review_scores_checkin','review_scores_communication', 'review_scores_location',
'review_scores_value', 'license', 'instant_bookable','calculated_host_listings_count','calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms',
'calculated_host_listings_count_shared_rooms', 'reviews_per_month']
list_b = ['name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood','latitude', 'longitude', 'room_type', 'price',
'minimum_nights','number_of_reviews', 'last_review', 'reviews_per_month','calculated_host_listings_count', 'availability_365','number_of_reviews_ltm', 'license']
set_a = set(list_a)
set_b = set(list_b)
new_list = list(set_a - set_b)
print(new_list)
['neighbourhood_cleansed', 'property_type', 'host_picture_url', 'maximum_minimum_nights', 'host_since', 'last_scraped', 'maximum_maximum_nights', 'calculated_host_listings_count_private_rooms', 'availability_60', 'host_is_superhost', 'amenities', 'host_response_time', 'review_scores_value', 'review_scores_checkin', 'availability_30', 'review_scores_accuracy', 'host_listings_count', 'neighbourhood_group_cleansed', 'host_total_listings_count', 'maximum_nights_avg_ntm', 'instant_bookable', 'first_review', 'minimum_minimum_nights', 'calculated_host_listings_count_shared_rooms', 'calendar_last_scraped', 'review_scores_rating', 'has_availability', 'minimum_maximum_nights', 'minimum_nights_avg_ntm', 'scrape_id', 'review_scores_location', 'host_neighbourhood', 'description', 'host_identity_verified', 'review_scores_communication', 'calendar_updated', 'host_thumbnail_url', 'host_response_rate', 'number_of_reviews_l30d', 'review_scores_cleanliness', 'source', 'accommodates', 'host_about', 'host_acceptance_rate', 'host_has_profile_pic', 'host_location', 'bathrooms_text', 'picture_url', 'bathrooms', 'calculated_host_listings_count_entire_homes', 'host_verifications', 'beds', 'bedrooms', 'host_url', 'availability_90', 'maximum_nights', 'listing_url', 'neighborhood_overview']
MERGE LISTING AND LISTINGS_IN_DETAILS
listings = pd.merge(listings, listings_details[new_list], on='id', how='left')
listings.columns
Index(['name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood',
'latitude', 'longitude', 'room_type', 'price', 'minimum_nights',
'number_of_reviews', 'last_review', 'reviews_per_month',
'calculated_host_listings_count', 'availability_365',
'number_of_reviews_ltm', 'license', 'neighbourhood_cleansed',
'property_type', 'host_picture_url', 'maximum_minimum_nights',
'host_since', 'last_scraped', 'maximum_maximum_nights',
'calculated_host_listings_count_private_rooms', 'availability_60',
'host_is_superhost', 'amenities', 'host_response_time',
'review_scores_value', 'review_scores_checkin', 'availability_30',
'review_scores_accuracy', 'host_listings_count',
'neighbourhood_group_cleansed', 'host_total_listings_count',
'maximum_nights_avg_ntm', 'instant_bookable', 'first_review',
'minimum_minimum_nights', 'calculated_host_listings_count_shared_rooms',
'calendar_last_scraped', 'review_scores_rating', 'has_availability',
'minimum_maximum_nights', 'minimum_nights_avg_ntm', 'scrape_id',
'review_scores_location', 'host_neighbourhood', 'description',
'host_identity_verified', 'review_scores_communication',
'calendar_updated', 'host_thumbnail_url', 'host_response_rate',
'number_of_reviews_l30d', 'review_scores_cleanliness', 'source',
'accommodates', 'host_about', 'host_acceptance_rate',
'host_has_profile_pic', 'host_location', 'bathrooms_text',
'picture_url', 'bathrooms',
'calculated_host_listings_count_entire_homes', 'host_verifications',
'beds', 'bedrooms', 'host_url', 'availability_90', 'maximum_nights',
'listing_url', 'neighborhood_overview'],
dtype='object')
#drop the missing values of the neighbourhood groups
listings = listings.drop(columns=['neighbourhood_group'])
listings['host_response_rate'] = pd.to_numeric(listings['host_response_rate'].str.strip('%'))
listings.head()
| name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | ... | bathrooms | calculated_host_listings_count_entire_homes | host_verifications | beds | bedrooms | host_url | availability_90 | maximum_nights | listing_url | neighborhood_overview | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| 1050164598350388320 | Self service English breakfast | 211218678 | Paul | Botany Bay | -33.924060 | 151.197320 | Private room | 100.0 | 1 | 0 | ... | 0.5 | 0 | ['email', 'phone'] | 1.0 | 1.0 | https://www.airbnb.com/users/show/211218678 | 89 | 365 | https://www.airbnb.com/rooms/1050164598350388320 | NaN |
| 1096082243192434668 | Apartment in Rushcutters Bay | 26721338 | Anne | Sydney | -33.874830 | 151.225800 | Entire home/apt | 250.0 | 1 | 0 | ... | 1.0 | 1 | ['email', 'phone'] | 2.0 | 2.0 | https://www.airbnb.com/users/show/26721338 | 48 | 365 | https://www.airbnb.com/rooms/1096082243192434668 | NaN |
| 2167538 | Modern Sydney home near beach,shops | 11059668 | Sophie | Waverley | -33.895820 | 151.245490 | Entire home/apt | 650.0 | 90 | 1 | ... | 2.0 | 1 | ['email', 'phone'] | 3.0 | 3.0 | https://www.airbnb.com/users/show/11059668 | 90 | 1125 | https://www.airbnb.com/rooms/2167538 | Queens Park is a safe and friendly neighbourho... |
| 790607305500629830 | Luxury double room | 202112801 | Vivian | Canada Bay | -33.851626 | 151.086324 | Private room | 84.0 | 1 | 31 | ... | 1.0 | 5 | ['email', 'phone'] | 1.0 | 1.0 | https://www.airbnb.com/users/show/202112801 | 90 | 365 | https://www.airbnb.com/rooms/790607305500629830 | NaN |
| 641106357139201992 | Chic Studio Apartment in Prime Bondi Location | 279001183 | MadeComfy | Waverley | -33.890550 | 151.275230 | Entire home/apt | 508.0 | 1 | 17 | ... | 1.0 | 217 | ['email', 'phone', 'work_email'] | 1.0 | 1.0 | https://www.airbnb.com/users/show/279001183 | 0 | 365 | https://www.airbnb.com/rooms/641106357139201992 | Bondi is one of Australia’s most iconic suburb... |
5 rows × 74 columns
listings.accommodates.head()
id 1050164598350388320 2 1096082243192434668 4 2167538 5 790607305500629830 1 641106357139201992 2 Name: accommodates, dtype: int64
listings['host_identity_verified'] = listings['host_identity_verified'].replace({'t': 'verified', 'f': 'unverified'})
- Top 10 Neighbourhoods for Airbnb
feq = listings['neighbourhood'].value_counts().sort_values(ascending=True)
top_10_feq = feq.tail(10) # Get the top 10 neighborhoods with the most listings
total_listings_by_neighborhood = listings.groupby('neighbourhood')['calculated_host_listings_count'].count() # Count the total occurrences of 'id' by neighborhood
fig, ax = plt.subplots(figsize=(8, 5))
top_10_feq.plot.barh(ax=ax, color='b', width=0.8)
# Add labels to the right of the bars
for i, v in enumerate(top_10_feq):
neighborhood_name = top_10_feq.index[i] # Get neighborhood name
total_listings = total_listings_by_neighborhood.loc[neighborhood_name]
ax.text(v + 0.1, i, f"{total_listings:,.0f}", va='center') # Display total listings
plt.title("Number of Listings and Total Listing IDs by Neighbourhood (Top 10)", fontsize=16)
plt.xlabel('Number of Listings', fontsize=12)
plt.ylabel('Neighbourhood', fontsize=12)
plt.tight_layout()
plt.show()
listings.room_type.count()
14274
unique_host_count = listings['host_name'].nunique()
print( unique_host_count)
3293
ken_listings_count = listings[listings['host_name'] == 'Ken']['room_type'].count()
print(ken_listings_count)
157
Daily Average Price in top 10 Neighborhoods
top_neighbourhoods = top_10_feq.index
filtered_listings = listings[listings['neighbourhood'].isin(top_neighbourhoods)]
feq = filtered_listings[filtered_listings['accommodates']==2]
feq = feq.groupby('neighbourhood')['price'].mean().sort_values(ascending=True)
feq.plot.barh(figsize=(10, 8), color='#0066CC', width=0.7)
plt.title("Average daily price for a 2-persons accommodation", fontsize=18)
plt.xlabel('Average daily price (Euro)', fontsize=8)
plt.ylabel("")
plt.tick_params(axis='y', labelsize=8)
for index, value in enumerate(feq):
plt.text(value, index, f"${value:.2f}", va='center', color='black', fontsize=8)
plt.show()
#Heatmap of the price
Sydney is the most crowded neighbourhood with around 3,000 properties for the Airbnb purposes
feq = listings['neighbourhood'].value_counts().sort_values(ascending=True)
top_10_feq = feq.head(10)
total_value_by_neighborhood = listings.groupby('neighbourhood')['price'].sum()
fig, ax = plt.subplots(figsize=(8, 5))
top_10_feq.plot.barh(ax=ax, color='b', width=0.8)
# Add labels to the right of the bars
for i, v in enumerate(top_10_feq):
neighborhood_name = top_10_feq.index[i] # Get neighborhood name
total_value = total_value_by_neighborhood.loc[neighborhood_name]
ax.text(v + 0.1, i, f"${total_value:,.0f}", va='center') # Adjust spacing and formatting
plt.title("Total Expected Income by Neighbourhood (Top 10)", fontsize=16)
plt.xlabel('Number of listings', fontsize=12)
plt.ylabel('Total Value', fontsize=12)
plt.tight_layout()
plt.show()
TOP 10 Neighbourhoods with the most income
# For example, if you want to filter based on the 'availability_365' column:
unavailable_listings = listings[listings['availability_30'] == 0]
# Count the number of listings by neighbourhood
feq = unavailable_listings['neighbourhood'].value_counts().sort_values(ascending=False)
top_10_feq = feq.head(10)
# Calculate total value by neighbourhood using the filtered data
total_value_by_neighborhood = unavailable_listings.groupby('neighbourhood')['price'].sum()
# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
top_10_feq.plot.barh(ax=ax, color='b', width=0.8)
# Add labels to the right of the bars
for i, v in enumerate(top_10_feq):
neighborhood_name = top_10_feq.index[i] # Get neighbourhood name
total_value = total_value_by_neighborhood.loc[neighborhood_name]
ax.text(v + 0.1, i, f"${total_value:,.0f}", va='center') # Adjust spacing and formatting
plt.title("Number of Listings and Total Value by Neighbourhood (Top 10)", fontsize=16)
plt.xlabel('Number of Listings', fontsize=12)
plt.ylabel('Total Value', fontsize=12)
plt.tight_layout()
plt.show()
The Neighbourhood Map
lats = tourist['Latitude'].tolist()
lons = tourist['Longitude'].tolist()
tp = tourist['Tourist Place'].tolist()
map1 = folium.Map(location=[-33.90568, 151.2093], zoom_start=11.5)
marker_cluster = MarkerCluster().add_to(map1)
for lat, lon, tp in zip(lats, lons, tp):
popup_content = f"Tourist Place: {tp}"
popup = folium.Popup(popup_content, max_width=300)
folium.Marker(location=[lat, lon], popup=popup).add_to(marker_cluster)
map1
the original code with the explanation- import matplotlib.pyplot as plt
feq = listings['neighbourhood'].value_counts().sort_values(ascending=True) top_10_feq = feq.head(10)
fig, ax = plt.subplots(figsize=(8, 5))
top_10_feq.plot.barh(ax=ax, color='b', width=0.8)
for i, v in enumerate(top_10_feq): neighborhood_name = top_10_feq.index[i] # Get neighborhood name total_value = total_value_by_neighborhood.loc[neighborhood_name] ax.text(v + 0.1, i, f"${total_value:,.0f}", va='center') # Adjust spacing and formatting
plt.title("Number of listings and Total Value by Neighbourhood (Top 10)", fontsize=16) plt.xlabel('Number of listings', fontsize=12) plt.ylabel('Total Value', fontsize=12) plt.tight_layout() plt.show()
lats = listings['latitude'].tolist()
lons = listings['longitude'].tolist()
host_names = listings['host_name'].tolist()
room_types = listings['room_type'].tolist()
licenses = listings['license'].tolist()
verify = listings['host_identity_verified'].tolist()
review_scores_rating = listings['review_scores_rating'].fillna(0).tolist() # Assuming you want to fill NA/NaN values with 0
map1 = folium.Map(location=[-33.90568, 151.2093], zoom_start=11.5)
marker_cluster = MarkerCluster().add_to(map1)
for lat, lon, host_name, room_type, license, verify, rpm in zip(lats, lons, host_names, room_types, licenses, verify, review_scores_rating):
popup_content = f"Host Name: {host_name}<br>Room Type: {room_type}<br>License: {license}<br>Verified Host: {verify}<br>Reviews per Month: {rpm}"
popup = folium.Popup(popup_content, max_width=300)
folium.Marker(location=[lat, lon], popup=popup).add_to(marker_cluster)
map1